

###########################################################
##### Haiti elite network project  		          			#####
##### create family-level network vars        				#####
##### 2021 mar 03                   									#####
###########################################################


## 1. collapse network data to fam level
## 2. merge in family characteristics
## 3. make igraph object
## 4. calculate network stats


#####
## prepare family network data
#####

##
## read network data into graph form
##

## set period
# period <- 1950
# period <- 1925
period <- 'all'

## read in the ps edgelist 

ps_graph <- read.graph('01_Data/02_Clean/gene_ps_all.graphml',format='graphml')


##
## subset to only the right group of the network
##

ps_mat <- get.data.frame(ps_graph, what=c('edges'))
ps_att <- get.data.frame(ps_graph, what=c('vertices'))


## subset to teh right time period

if (period == 1950) {
  ps_att <- subset(ps_att, ps_att$cohort >= 1850 & ps_att$cohort <= 1950)
  ps_mat <- subset(ps_mat, ps_mat$cohort25 >= "(1850,1875]" & ps_mat$cohort25 <= "(1925,1950]")
}
if (period == 1925) {
  ps_att <- subset(ps_att, ps_att$cohort >= 1850 & ps_att$cohort <= 1925)
  ps_mat <- subset(ps_mat, ps_mat$cohort25 >= "(1850,1875]" & ps_mat$cohort25 <= "(1900,1925]")
}
if (period == "all"){
  ps_att <- subset(ps_att, (ps_att$cohort >= 1850 & ps_att$cohort <= 1975))
  ps_mat <- subset(ps_mat, (ps_mat$cohort25 >= "(1850,1875]" & ps_mat$cohort25 <= "(1950,1975]"))
}



##
## collapse to fam
##

# create attribute matrix for ego
fam_mat <- subset(ps_mat, select = c(from, to, cohort15, cohort25, cohort10, parent, spouse))
fam_mat <- merge(fam_mat, ps_att, by.x='from', by.y='name', all.x=T)

# subset to non-zero spouse edges
fam_mat <- subset(fam_mat, fam_mat$spouse!=0, select = c("from", "to", "family", "cohort"))

# merge in family name of alter
setnames(fam_mat, c('family', 'cohort'), c('family_e', 'cohort_e'))
fam_mat <- merge(fam_mat, ps_att, by.x = "to", by.y = "name", all.x = T)
fam_mat <- subset(fam_mat, select = c("family_e", "cohort_e", "family", "cohort"))
setnames(fam_mat, c('family', 'cohort'), c('family_a', 'cohort_a'))

# get rid of people with family name "NA"
fam_mat <- subset(fam_mat, fam_mat$family_a!="NA")
fam_mat <- subset(fam_mat, fam_mat$family_e!="NA")

## pull out list of unique families
fam_att <- data.frame("family" = unique(ps_att$family))


#####
## merge family characteristics into network data
#####

##
## merge biz info into gen
##

own <- read.dta("01_Data/02_Clean/fam_biz_prod.dta")
own <- data.table(own[own$fam!="NA",])
own <- own[, lapply(.SD, sum, na.rm=T), 
           .SDcols = c('value_1', 'value_wo_02', 'value_ha_02', 'value_wo_0212', 
                       'value_wo_autoc', 'value_log', 'value_log0212', 'value_logautoc',
                       'value_bin', 'value_bin0212', 'value_binautoc'),
           by = 'fam']
own$biz <- 1


## merge in to fam network data

fam_att <- merge(fam_att, own, by.x = "family", by.y = 'fam', all.x = T)
table(na.omit(fam_att$biz))/dim(fam_att)[1]   # proportion of elite fams who are biz-owners
own$fam[!(own$fam %in% fam_att$family)]   # biz owning fams who are not in gen data
own$fam[(own$fam %in% fam_att$family)]   # biz owning fams who are in gen daa
length(own$fam[(own$fam %in% fam_att$family)])/length(own$fam)   # prop of biz owners who are matched in gen data



##
## merge political variables into family gen data
##

## supplice pol bios
pol <- read.csv("01_Data/02_Clean/polbios_fam.csv", as.is=T)
pol$X = NULL

fam_att <- merge(fam_att, pol, by.x = "family", by.y = "name_last", all.x = T)

## make dummy if at least one fam member served in legis, exec, or judic
vars <- colnames(fam_att)[which(colnames(fam_att)=="legis"):which(colnames(fam_att)=="pol_duvpre1")]
for (i in vars){
  fam_att[,vars] <- ifelse(fam_att[,vars] >= 1, 1, 0)
}


##
## add other measures
##

## OFAC coups
coup <- read.csv("01_Data/02_Clean/coup_fams.csv", as.is = T)
coup$X = NULL
setnames(coup, c('last_name', 'coup_bi'), c('family', 'coup'))
fam_att <- merge(fam_att, coup, by = "family", all.x = T)

## immigration
book <- read.csv("01_Data/02_Clean/all_immig.csv")
book$X=NULL
fam_att <- merge(fam_att, book, by.x = "family", by.y = "fam", all.x = T)

## NA -> 0
vars <- colnames(fam_att)[which(colnames(fam_att)=="biz"):which(colnames(fam_att)=="african")]
fam_att[,vars] <- apply(fam_att[,vars], 2, function(x) {x <- car::recode(x,"NA=0"); x})


##
## final steps
##

## check for duplicates
fam_att[duplicated(fam_att$family)==T,]
fam_att <- subset(fam_att, duplicated(fam_att$family)==F)

## create a value with zeros for non-econ elites
fam_att$value_2 <- ifelse(is.na(fam_att$value_1)==T, 0, fam_att$value_1)
fam_att$value_log_nona2 <- log(fam_att$value_2 + 1)
fam_att$value_wo_02_nona <- ifelse(is.na(fam_att$value_wo_02)==T, 0, fam_att$value_wo_02)
fam_att$value_ha_02_nona <- ifelse(is.na(fam_att$value_ha_02)==T, 0, fam_att$value_ha_02)
fam_att$value_wo_0212_nona <- ifelse(is.na(fam_att$value_wo_0212)==T, 0, fam_att$value_wo_0212)
fam_att$value_wo_autoc_nona <- ifelse(is.na(fam_att$value_wo_autoc)==T, 0, fam_att$value_wo_autoc)
fam_att$value_log_nona <- ifelse(is.na(fam_att$value_log)==T, 0, fam_att$value_log)
fam_att$value_log0212_nona <- ifelse(is.na(fam_att$value_log0212)==T, 0, fam_att$value_log0212)
fam_att$value_logautoc_nona <- ifelse(is.na(fam_att$value_logautoc)==T, 0, fam_att$value_logautoc)
fam_att$value_bin_nona <- ifelse(is.na(fam_att$value_bin)==T, 0, fam_att$value_bin)
fam_att$value_bin0212_nona <- ifelse(is.na(fam_att$value_bin0212)==T, 0, fam_att$value_bin0212)
fam_att$value_binautoc_nona <- ifelse(is.na(fam_att$value_binautoc)==T, 0, fam_att$value_binautoc)

## scale value to fall between 0 and 1
fam_att$value_log_nona_scaled <- (fam_att$value_log_nona - min(fam_att$value_log_nona))/(max(fam_att$value_log_nona) - min(fam_att$value_log_nona))
fam_att$value_log_nona_rank <- rank(fam_att$value_log_nona)
fam_att$value_log_nona_bin <- car::recode(fam_att$value_log_nona, "0=0;NA=NA;else=1")
m <- min(fam_att$value_log_nona[fam_att$value_log_nona>0], na.rm=T)
fam_att$value_log_nona_pos10 <- car::recode(fam_att$value_log_nona, "0:m=m")


## write family attributes to csv
if (period == "all"){
  write.csv(fam_att, "01_Data/02_Clean/family_attributes.csv")
}


#####
## set up graph
#####

fam_mat <- cbind.data.frame('ego'=fam_mat$family_e, "alter"=fam_mat$family_a, 
                            'spouse_tie' = rep(1, dim(fam_mat)[1]), 'cohort'=fam_mat$cohort_e)

## make into graph object
fam_graph <- graph.data.frame(fam_mat,vertices=fam_att,directed=T)


# make undirected
fam_graph <- as.undirected(fam_graph, mode = 'collapse', edge.attr.comb='first')


## write fam graph and elite graph to graph objects

if (period == 1950) {
  write.graph(fam_graph,'01_Data/02_Clean/fam_graph_1950.graphml',format='graphml')
}
if (period == 1925) {
  write.graph(fam_graph,'01_Data/02_Clean/fam_graph_1925.graphml',format='graphml')
}
if (period != 1925 & period != 1950) {
  write.graph(fam_graph,'01_Data/02_Clean/fam_graph.graphml',format='graphml')
}


#####
## calculate network stats
#####

##
## write function for bonancich centrality
##

## calculate bonacich centrality (wiht optoinal weights by nodes)
bonpow2 <- function(adj, beta, nodew = 1){
  solve(diag(dim(adj)[1]) - (beta)*adj) %*% matrix(nodew, nrow(adj), 1)
}

## calculate M matrix
makeMmatrix <- function(adj, beta){
  solve(diag(dim(adj)[1]) - (beta)*adj)
}

## calculate bonpow centrality
bonpowM <- function(M, nodew = 1){
  M %*% matrix(nodew, nrow(M), 1)
}



##
## unweighted network stats
##


## plain degree - # of nodes directly connected by marriage ties
t <- degree(fam_graph)
fam_cent <- cbind.data.frame("family" = rownames(as.data.frame(t)), "degree_all_uw" = t)


## unweighted bonacich centrality

fam_adj_uw <- get.adjacency(fam_graph)

eigens <- eigen(fam_adj_uw)
e <- max(eigens$values)

M_uw_02 <- makeMmatrix(adj = fam_adj_uw, beta = 0.2*(1/e))
bon_02_uw <- bonpowM(M = M_uw_02)

t <- data.frame('family' = fam_att$family, 
                'bon_02_uw' = matrix(bon_02_uw))
fam_cent <- merge(fam_cent, t, by = "family", all = T)



##### 
## number of individuals in time period by family
#####

fams = na.omit(unique(ps_att$family))
nind <- data.frame('fam' = rep(NA, length(fams)), 'nind' = rep(NA, length(fams)))

for (i in 1:length(fams)){
  attributes <- ps_att[ps_att$family == fams[i],]
  nind$nind[i] <- dim(attributes)[1]
  nind$fam[i] <- unique(attributes$fam)
}

if(period=="all"){
  fam_cent <- merge(fam_cent, nind, by.x = "family", by.y = 'fam', all = T)
}



##
## value weighted nodes (no edge weights)
##

## (using same unweighted eigenvalue as before)
bonw_02_uw <- bonpowM(M = M_uw_02, nodew = fam_att$value_log_nona)

t <- cbind.data.frame('family' = fam_att$family,
                      'bonw_02_uw' = as.vector(bonw_02_uw))
fam_cent <- merge(fam_cent, t, by = "family", all = T)




##
## family size edge weights measures
## 

## weighted bonacich taking family size into account

nind <- nind[order(nind$fam),]

fam_graph <- set.vertex.attribute(fam_graph, 'nind', index=V(fam_graph), nind$nind)
fam_edge <- get.edgelist(fam_graph)
fam_edge <- data.frame('fam_e' = fam_edge[,1], 'fam_a' = fam_edge[,2])
fam_edge <- merge(fam_edge, data.frame('fam_e' = nind$fam, 'nind_e' = nind$nind), by = 'fam_e', all.x = T)
fam_edge <- merge(fam_edge, data.frame('fam_a' = nind$fam, 'nind_a' = nind$nind), by = 'fam_a', all.x = T)
fam_edge$nind_w <- 1/(fam_edge$nind_e*fam_edge$nind_a)^(1/2)
fam_edge$nind_wsum <- 1/((fam_edge$nind_e+fam_edge$nind_a)/2)

fam_graph <- set.edge.attribute(fam_graph, 'nind_w', index=E(fam_graph), fam_edge$nind_w)
fam_graph <- set.edge.attribute(fam_graph, 'nind_wsum', index=E(fam_graph), fam_edge$nind_wsum)

## export adjacency matrix with weights
fam_adj_wnind <- get.adjacency(fam_graph, attr='nind_w')
fam_att_wnind <- get.data.frame(fam_graph, what=c('vertices'))

## get adjacency and take max eigenvalue
eigens <- eigen(fam_adj_wnind)
e <- max(eigens$values)
min(eigens$values)

## calculate nind-weighted centrality
M_wnind_02 <- makeMmatrix(adj = fam_adj_wnind, beta = 0.2*(1/e))
bon_02_wnind <- bonpowM(M = M_wnind_02)

t <- cbind.data.frame('family' = fam_att_wnind$name, 
                      'bon_02_wnind' = as.vector(bon_02_wnind))
fam_cent <- merge(fam_cent, t, by = "family", all = T)


## degree
t <- graph.strength(fam_graph, weights = E(fam_graph)$nind_w)
t <- cbind.data.frame('family' = rownames(as.data.frame(t)), "degree_all_wnind" = t)
fam_cent <- merge(fam_cent, t, by = "family", all = T)


## degree in teh network of coup participators 
coup <- V(fam_graph)[V(fam_graph)$coup==1]
nocoup <- V(fam_graph)[V(fam_graph)$coup==0]

coup_graph <- delete.vertices(fam_graph, V(fam_graph)[ nocoup ])
degree_cc <- graph.strength(coup_graph, weights = E(coup_graph)$nind_w)
coupnc_graph <- delete.edges(fam_graph, E(fam_graph)[ !(coup %--% nocoup) ])
degree_cnc <- graph.strength(coupnc_graph, weights = E(coupnc_graph)$nind_w)

degree_cc <- cbind.data.frame('family' = rownames(as.data.frame(degree_cc)),
                              'degree_cc' = degree_cc)
degree_cnc <- cbind.data.frame('family' = rownames(as.data.frame(degree_cnc)),
                               'degree_cnc' = degree_cnc)
t <- cbind.data.frame('family' = V(fam_graph)$name, 'coup' = V(fam_graph)$coup)

t <- merge(t, degree_cc, by = 'family', all = T)
t <- merge(t, degree_cnc, by = 'family', all = T)

t$degree_coupall_wnind <- ifelse(t$coup==1, t$degree_cc, t$degree_cnc)
t <- subset(t, select = c(family, degree_coupall_wnind))
fam_cent <- merge(fam_cent, t, by = "family", all = T)




##
## value and family size-weighted measures
##

## weighted bonacich taking family size into account (using same weighted eigenvalue as previous)
bonw_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att_wnind$value_log_nona)
bonwr_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att$value_log_nona_rank)
bonwp_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att$value_log_nona_pos10)
bonwb_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att$value_log_nona_bin)


for (i in seq(0,1,0.1)){
  t <- bonpowM(M = makeMmatrix(adj = fam_adj_wnind, beta = (1/e*i)), nodew = fam_att_wnind$value_log_nona)
  assign(paste0('bonw', i*10, '_wnind'), t)
}

t <- cbind.data.frame('family' = fam_att_wnind$name, 
                      'bonw_02_wnind' = as.vector(bonw_02_wnind),
                      'bonwr_02_wnind' = as.vector(bonwr_02_wnind),
                      'bonwp_02_wnind' = as.vector(bonwp_02_wnind),
                      'bonwb_02_wnind' = as.vector(bonwb_02_wnind),
                      'bonw0_wnind' = as.vector(bonw0_wnind),
                      'bonw1_wnind' = as.vector(bonw1_wnind),
                      'bonw2_wnind' = as.vector(bonw2_wnind),
                      'bonw3_wnind' = as.vector(bonw3_wnind),
                      'bonw4_wnind' = as.vector(bonw4_wnind),
                      'bonw5_wnind' = as.vector(bonw5_wnind),
                      'bonw6_wnind' = as.vector(bonw6_wnind),
                      'bonw7_wnind' = as.vector(bonw7_wnind),
                      'bonw8_wnind' = as.vector(bonw8_wnind),
                      'bonw9_wnind' = as.vector(bonw9_wnind),
                      'bonw10_wnind' = as.vector(bonw10_wnind))
fam_cent <- merge(fam_cent, t, by = "family", all = T)



## 
## alternate node weights
##


bonwbin_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att_wnind$value_bin_nona)
bonwbinautoc_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att$value_binautoc_nona)
bonw0212_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att$value_log0212_nona)
bonwautoc_02_wnind <- bonpowM(M = M_wnind_02, nodew = fam_att$value_logautoc_nona)


t <- cbind.data.frame("family" = fam_att_wnind$name, 
                      'bonwbin_02_wnind' = matrix(bonwbin_02_wnind),
                      'bonw0212_02_wnind' = matrix(bonw0212_02_wnind),
                      'bonwautoc_02_wnind' = matrix(bonwautoc_02_wnind))
fam_cent <- merge(fam_cent, t, by = "family", all = T)



##
## node weights with own value set to 1
##

t <- data.frame('fam'=rownames(fam_adj_wnind),
                'bonw_02_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw0_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw1_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw2_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw3_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw4_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw5_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw6_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw7_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw8_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw9_wnind_own0'=rep(NA,length(bonw_02_wnind)),
                'bonw10_wnind_own0'=rep(NA,length(bonw_02_wnind)))
nodew <- fam_att_wnind$value_log_nona


for (j in seq(0,1,0.1)){
  M_temp = makeMmatrix(adj = fam_adj_wnind, beta = (1/e*j))
  for (i in 1:length(bonw_02_wnind)){
    nodew <- fam_att_wnind$value_log_nona
    nodew[i] <- 0
    own0 <- bonpowM(M = M_wnind_02, nodew = nodew)
    t$bonw_02_wnind_own0[i] <- own0[i]
    temp <- bonpowM(M = M_temp, nodew = nodew)
    t[i,j*10+3] <- temp[i]
    print(paste('j=',j,'i=',i))
  }
}

fam_cent <- merge(fam_cent, t, by.x = 'family', by.y = 'fam', all.x = T)



##
## community clusters
##

fg<- fastgreedy.community(fam_graph)

t <- cbind.data.frame('family' = fg$names,
                      'fastgreedy'=fg$membership)

fam_cent <- merge(fam_cent, t, by = "family", all = T)



## 
## alternate edge weights
##

## summing family sizes
fam_adj_wnind2 <- get.adjacency(fam_graph, attr='nind_wsum')
fam_att_wnind2 <- get.data.frame(fam_graph, what=c('vertices'))

eigens <- eigen(fam_adj_wnind2)
e <- max(eigens$values)

bonw_02_wnind2 <- bonpow2(adj = fam_adj_wnind2, beta = (1/e)*0.2, nodew = fam_att_wnind2$value_log_nona)

t <- cbind.data.frame("family" = fam_att_wnind2$name, 
                      'bonw_02_wnind2' = matrix(bonw_02_wnind2))
fam_cent <- merge(fam_cent, t, by = "family", all = T)




#####
## prob that people w same last name are connected by blood
#####

if (period != 1925 & period != 1950) {
  
  ps_all <- read.graph('01_Data/02_Clean/gene_ps_all.graphml', format = 'graphml')
  
  ps_mat_all <- get.data.frame(ps_all, what=c('edges'))
  ps_att_all <- get.data.frame(ps_all, what=c('vertices'))
  
  ps_mat_all <- merge(ps_mat_all, ps_att_all, by.x='from', by.y='name', all.x=T)
  fams = na.omit(unique(ps_mat_all$family))
  
  clanstats <- cbind.data.frame('fams'=fams,matrix(NA,length(fams),7))
  colnames(clanstats) <- c('fams','nclust','reachability','maxclust','oldest','newest',
                           'nind_all','qual')
  
  for (i in 1:length(fams)){
    
    last_mat <- subset(ps_mat_all,ps_mat_all$family==fams[i],
                       select=c('from','to','parent','spouse','cohort'))
    attributes <- ps_att_all[ps_att_all$name %in% last_mat$from | ps_att_all$name %in% last_mat$to,]
    last_graph <- graph.data.frame(last_mat, vertices=attributes)
    
    ## calc reachability and number of clusters by last name
    
    # number of clusters within a single last name
    clanstats[i,2] <- clusters(last_graph)$no
    
    # probability that two people w same last name are connected by blood
    reach <- geodist(get.adjacency(as.undirected(last_graph), sparse = F))$count
    clanstats[i,3] <- table(reach>0)['TRUE']/dim(reach)[1]^2
    
    # size of largest cluster as % of nodes
    clanstats[i,4] <- max(clusters(last_graph)$csize)/length(V(last_graph))
    
    # earliest cohort
    clanstats[i,5] <- min(get.vertex.attribute(last_graph,"cohort"),na.rm=T)
    
    # latest cohort
    clanstats[i,6] <- max(get.vertex.attribute(last_graph,"cohort"),na.rm=T)
    
    # number of members
    clanstats[i,7] <- length(V(last_graph))
    
    # overall data quality score
    clanstats[i,8] <- mean(c(dim(attributes[is.na(attributes$birth)==F,])[1]/dim(attributes)[1],
                             dim(attributes[is.na(attributes$death)==F,])[1]/dim(attributes)[1]),
                           na.rm=T)   
    
    print(i)
  }
  
  clantab <- rbind(summary(na.omit(clanstats$nclust)),
                   summary(na.omit(clanstats$reachability)),
                   summary(na.omit(clanstats$maxclust)),
                   summary(na.omit(clanstats$oldest)),
                   summary(na.omit(clanstats$newest)),
                   summary(na.omit(clanstats$nind_all)),
                   summary(na.omit(clanstats$qual)))
  rownames(clantab) <- c('Num clusters','Reachability','Biggest cluster',
                         'Oldest edge','Newest edge','Num inds','Qual')

}



#####
## merge it together
#####

if (period != 1950 & period != 1925) {
  fam_cent <- merge(fam_cent, clanstats, by.x = "family", by.y = "fams", all.x = T)
}

if (period == 1950 | period == 1925) {
  setnames(fam_cent, 
           colnames(fam_cent)[2:dim(fam_cent)[2]], 
           paste0(colnames(fam_cent)[2:dim(fam_cent)[2]], '.', period))
}


if (period == 1925) {
  write.csv(fam_cent, '01_Data/02_Clean/centrality_1925.csv')
}
if (period == 1950) {
  write.csv(fam_cent, '01_Data/02_Clean/centrality_1950.csv')
}
if (period != 1950 & period != 1925) {
  write.csv(fam_cent, '01_Data/02_Clean/centrality.csv')
}


  
  
  
  
  
  
